import requests
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as px
import plotly.express as pxe
import plotly.io as pio
pio.renderers.default = 'notebook' #for PLOTY rendering in VSCODE, disable acording to env needs
import os
import glob
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as ts
import datetime as dt
from datetime import *
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import utils
import warnings
warnings.filterwarnings('ignore')
For our crawling method we had to format the time strings according to our needs.
Therefore we created the function "convert24", which takes an AM/PM time string and converts it to 24-hour format.
def convert24(timeStr):
    """Convert a 12-hour clock string to a zero-padded 24-hour ``HH:MM`` string.

    Args:
        timeStr: Time of day in ``%I:%M %p`` form, e.g. ``"1:05 PM"``.
            Leading and trailing whitespace is ignored.

    Returns:
        The same time of day formatted as ``"HH:MM"``, e.g. ``"13:05"``.

    Raises:
        ValueError: If the text is not a valid ``%I:%M %p`` time.
    """
    # The text arrives as fixed-width slices of scraped page text, so it can
    # carry stray whitespace; strptime rejects that unless it is stripped.
    conv = datetime.strptime(timeStr.strip(), '%I:%M %p')
    return "{:02d}:{:02d}".format(conv.hour, conv.minute)
We decided to use Selenium WebDriver as our crawling method because it allowed us more control.
The first site, "phaseMoon", was a tough one to handle because it contained no "class" or "id" attributes for any of its elements.
To solve the issues we faced, we dove into the site's HTML code and realized that it had a constant structure and that the data itself was the only dynamic part.
Our solution was to create multiple arrays inside one another, with each index containing a different row of the data table.
By doing so we managed to access each element in the data table and extract the data we needed.
# Crawl phasesmoon.com one calendar day at a time and store each day's moon
# data as one row of 'Data Frames/data_table.csv'.
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Set Initial Dates
current_date = dt.date.today()
starting_date = dt.date(2008, 1, 1)
delta = dt.timedelta(days=1)

# Set the WebDriver path and the driver itself.
# Change the WebDriver directory according to your machine.
serv = Service("/Users/oranmor/Documents/chromedriver")
driver = webdriver.Chrome(service=serv)

# Column layout of the CSV file; it has to exist before the writer is built.
fieldName = ['Date', 'Illumination', 'Moon rise', 'Moon set', 'Length',
             'Moon distance', 'Moon angle', 'Phase', 'Next full moon',
             'Cycle age', 'Events', 'Site']

try:
    with open('Data Frames/data_table.csv', 'w', newline='') as f:
        thewriter = csv.DictWriter(f, fieldnames=fieldName)
        thewriter.writeheader()

        while starting_date <= current_date:
            url = 'https://phasesmoon.com/moonday{day}{month}{year}.html'.format(
                day=starting_date.day,
                month=starting_date.strftime("%B"),
                year=starting_date.year,
            )
            driver.get(url)

            # Values default to empty so that a page that cannot be read
            # still produces a row for its date.
            Phase = ''
            Illumination = ''
            Moon_rise = ''
            Moon_set = ''
            Cycle_age = ''
            Moon_angle = ''
            moon_distance = ''

            # Handle timeouts / unexpected page layout while reading the table
            try:
                table = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "table"))
                )
                moonInfo = table.find_element(By.TAG_NAME, "tbody")
                moonInfo = moonInfo.find_elements(By.TAG_NAME, "tr")
                # The table has no ids/classes, so each value is identified
                # by the position of its row; the value sits in the 2nd cell.
                for index, tab in enumerate(moonInfo):
                    currData = tab.find_elements(By.TAG_NAME, "td")
                    data = currData[1]
                    if index == 0:
                        Phase = data.text
                    elif index == 1:
                        Illumination = data.text.replace('% Visible', '')
                    elif index == 2:
                        Moon_rise = convert24(data.text[0:8])
                        Moon_set = convert24(data.text[11:])
                    elif index == 3:
                        Cycle_age = data.text.replace(' Days', '')
                    elif index == 4:
                        Moon_angle = data.text
                    elif index == 5:
                        moon_distance = data.text.replace('km', '')
            except (TimeoutException, NoSuchElementException,
                    IndexError, ValueError) as err:
                # One unreadable page must not end the crawl: report it and
                # keep the browser session alive for the following dates.
                print(f"Could not read {url}: {err!r}")

            # Adding data from the site to the CSV file itself
            new_dt = starting_date.strftime("%d/%m/%Y")
            thewriter.writerow({
                'Date': new_dt,
                'Illumination': Illumination,
                'Moon rise': Moon_rise,
                'Moon set': Moon_set,
                'Length': '*',
                'Moon distance': moon_distance,
                'Moon angle': Moon_angle,
                'Phase': Phase,
                'Next full moon': '*',
                'Cycle age': Cycle_age,
                'Events': '*',
                'Site': 'phasesmoon',
            })
            starting_date += delta
            ts.sleep(0.5)
finally:
    # Always release the browser, including when the crawl is interrupted.
    driver.quit()
After exploring the web, we found that there are many sites that gave us different data.
To handle this, we decided to use the "crawling" method on another site, "MoonGiant".
For this site we used the same crawling method as before, Selenium WebDriver.
def string_to_only_nums(string):
    """Return the digit characters of *string*, in order, dropping everything else."""
    return "".join(filter(str.isdigit, string))
# Crawl moongiant.com one calendar day at a time and store each day's moon
# data as one row of 'Data Frames/data_table2.csv'.
from selenium.common.exceptions import TimeoutException

#start google chrome
# Raw string: the backslashes of the Windows path must not be read as escapes.
ser = Service(r"C:\Program Files (x86)\chromedriver.exe")
driver = webdriver.Chrome(service=ser)
counter = 0  # number of pages requested so far

#dates
current_date = dt.date.today()
starting_date = dt.date(2008, 1, 1)
delta = dt.timedelta(days=1)

fieldName = ['Date', 'Illumination', 'Moon rise', 'Moon set', 'Length',
             'Moon distance', 'Moon angle', 'Phase', 'Next full moon',
             'Cycle age', 'Events', 'Site']

try:
    # The file is opened once, and header and rows go to the same path.
    with open('Data Frames/data_table2.csv', 'w', newline='') as f:
        thewriter = csv.DictWriter(f, fieldnames=fieldName)
        thewriter.writeheader()

        # loop from the starting date to the current date
        while starting_date <= current_date:
            # the site expects month/day/year in its URL
            new_dt = starting_date.strftime("%m/%d/%Y")
            url = "https://www.moongiant.com/phase/{}".format(new_dt)
            driver.get(url)
            counter += 1

            # Start from an empty row so every date gets exactly one row,
            # even when its page cannot be read.
            row = {
                'Date': starting_date.strftime("%d/%m/%Y"),
                'Illumination': '',
                'Moon rise': '*',
                'Moon set': '*',
                'Length': '*',
                'Moon distance': '',
                'Moon angle': '',
                'Phase': '',
                'Next full moon': '*',
                'Cycle age': '',
                'Events': '*',
                'Site': 'MoonGiant',
            }
            try:
                # the values live in the <span> children of #moonDetails
                sight_data = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "moonDetails"))
                )
                sight_data = sight_data.find_elements(By.TAG_NAME, "span")
                row.update({
                    'Illumination': string_to_only_nums(sight_data[1].text),
                    'Moon distance': sight_data[4].text,
                    'Moon angle': sight_data[3].text,
                    'Phase': sight_data[0].text,
                    'Cycle age': sight_data[2].text,
                })
            except (TimeoutException, IndexError) as err:
                # One unreadable page must not end the crawl: report it and
                # keep the browser session alive for the following dates.
                print(f"Could not read {url}: {err!r}")

            thewriter.writerow(row)
            #incrementing the day by one
            starting_date += delta
finally:
    # Always release the browser, including when the crawl is interrupted.
    driver.quit()
After creating our DataFrames we took a look to see the initial data we collected.
By using some built-in functions of the "Pandas" library we saw that we had extracted approximately 42,000 cells of data from "PhaseMoon" alone and that none of them is empty (2008 until June 2022).
Because of that, we can tell that no initial missing-data cleanup is required.
# Load the table written by the phasesmoon crawl and show its (rows, columns).
dfP = pd.read_csv('Data Frames/data_table.csv')
dfP.shape
(5289, 12)
# Count the missing values in each column.
dfP.isnull().sum()
Date 0 Illumination 0 Moon rise 0 Moon set 0 Length 0 Moon distance 0 Moon angle 0 Phase 0 Next full moon 0 Cycle age 0 Events 0 Site 0 dtype: int64
# Preview the first rows of the table.
dfP.head()
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Next full moon | Cycle age | Events | Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/01/2008 | 44.84 | 0:25 | 12:02 | * | 401,804.18 | 0.50 | Waning crescent | * | 22.63 | * | phasesmoon |
| 1 | 02/01/2008 | 35.54 | 1:18 | 12:34 | * | 403,984.69 | 0.49 | Waning crescent | * | 23.53 | * | phasesmoon |
| 2 | 03/01/2008 | 26.82 | 2:10 | 13:10 | * | 405,219.35 | 0.49 | Waning crescent | * | 24.41 | * | phasesmoon |
| 3 | 04/01/2008 | 18.92 | 3:05 | 13:49 | * | 405,476.90 | 0.49 | Waning crescent | * | 25.30 | * | phasesmoon |
| 4 | 05/01/2008 | 12.09 | 4:00 | 14:33 | * | 404,759.39 | 0.49 | Waning crescent | * | 26.19 | * | phasesmoon |
# Preview the last rows of the table.
dfP.tail()
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Next full moon | Cycle age | Events | Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5284 | 20/06/2022 | 63.63 | 0:00 | 11:10 | * | 374,432.28 | 0.53 | Waning gibbous | * | 20.85 | * | phasesmoon |
| 5285 | 21/06/2022 | 52.88 | 0:01 | 12:07 | * | 379,030.58 | 0.53 | Last quarter | * | 21.88 | * | phasesmoon |
| 5286 | 22/06/2022 | 42.34 | 0:34 | 13:02 | * | 383,852.70 | 0.52 | Waning crescent | * | 22.87 | * | phasesmoon |
| 5287 | 23/06/2022 | 32.41 | 1:06 | 13:56 | * | 388,629.06 | 0.51 | Waning crescent | * | 23.84 | * | phasesmoon |
| 5288 | 24/06/2022 | 23.43 | 1:38 | 14:50 | * | 393,115.36 | 0.51 | Waning crescent | * | 24.78 | * | phasesmoon |
# Load the table written by the MoonGiant crawl and show its (rows, columns).
# The folder name is spelled 'Data Frames', exactly as in the rest of the
# file, so the path also resolves on case-sensitive filesystems.
dfG = pd.read_csv('Data Frames/data_table2.csv')
dfG.shape
(5289, 12)
# Count the missing values in each column.
dfG.isnull().sum()
Date 0 Illumination 0 Moon rise 0 Moon set 0 Length 0 Moon distance 0 Moon angle 0 Phase 7 Next full moon 0 Cycle age 0 Events 0 Site 0 dtype: int64
# Preview the first rows of the table.
dfG.head()
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Next full moon | Cycle age | Events | Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/01/2008 | 39 | * | * | * | 403,275.00 | 0.49 | Waning Crescent | * | 23.19 | * | MoonGiant |
| 1 | 02/01/2008 | 30 | * | * | * | 404,869.95 | 0.49 | Waning Crescent | * | 24.08 | * | MoonGiant |
| 2 | 03/01/2008 | 22 | * | * | * | 405,495.38 | 0.49 | Waning Crescent | * | 24.97 | * | MoonGiant |
| 3 | 04/01/2008 | 15 | * | * | * | 405,141.31 | 0.49 | Waning Crescent | * | 25.86 | * | MoonGiant |
| 4 | 05/01/2008 | 8 | * | * | * | 403,829.10 | 0.49 | Waning Crescent | * | 26.75 | * | MoonGiant |
# Preview the last rows of the table.
dfG.tail()
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Next full moon | Cycle age | Events | Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5284 | 20/06/2022 | 57 | * | * | * | 377,265.07 | 0.53 | Waning Gibbous | * | 21.50 | * | MoonGiant |
| 5285 | 21/06/2022 | 46 | * | * | * | 382,035.67 | 0.52 | Last Quarter | * | 22.50 | * | MoonGiant |
| 5286 | 22/06/2022 | 36 | * | * | * | 386,859.52 | 0.51 | Waning Crescent | * | 23.48 | * | MoonGiant |
| 5287 | 23/06/2022 | 27 | * | * | * | 391,481.02 | 0.51 | Waning Crescent | * | 24.43 | * | MoonGiant |
| 5288 | 24/06/2022 | 18 | * | * | * | 395,678.44 | 0.50 | Waning Crescent | * | 25.36 | * | MoonGiant |
After getting both of our DataFrames we needed to merge them.
For the merge, we decided to average the values of the two DataFrames and "inject" the result into a new CSV file.
For any data that the "MoonGiant" DataFrame was missing, or where it created a conflict (as with the "Moon phase" column), we defined the "PhaseMoon" DataFrame to be the "master".
# Merge the two crawled tables by stacking them into one frame.
# The files are listed explicitly, MoonGiant first and phasesmoon second:
# the code that pairs row i with row i + tab2 relies on that order, and
# glob() returns its matches in no guaranteed order.
joined_list = [
    os.path.join("Data Frames", "data_table2.csv"),  # MoonGiant rows
    os.path.join("Data Frames", "data_table.csv"),   # phasesmoon rows
]
merged_df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

# Output the new merged csv
merged_df.to_csv("Merged.csv", index=False)

row_names = ["Date", "Illumination", "Moon rise", "Moon set", "Length",
             "Moon distance", "Moon angle", "Phase", "Next full moon",
             "Cycle age", "Events", "Site"]

# Offset between a MoonGiant row and the phasesmoon row for the same date.
# Integer division keeps it usable as an index label.
tab2 = len(merged_df) // 2
def getLength(st, rs):
    """Return the number of hours from the set time to the rise time.

    Args:
        st: Set time as ``"H:MM"`` or ``"HH:MM"`` (24-hour clock).
        rs: Rise time in the same format.

    Returns:
        Hours elapsed going from ``st`` to ``rs``, rounded to two decimals.
        When ``st`` is later in the day than ``rs`` the interval is taken
        across midnight.

    Raises:
        ValueError: If either argument is not of the form ``hours:minutes``.
    """
    # NOTE(review): the interval is measured set -> rise, as before. If
    # "Length" is meant to be the time between rise and set, the operands
    # need swapping -- confirm with the authors.

    def _hours(clock):
        # Minutes are a fraction of 60, not a decimal part: "0:30" is 0.5 h.
        hh, mm = clock.split(":")
        return int(hh) + int(mm) / 60

    gap = _hours(rs) - _hours(st)
    if gap < 0:
        gap += 24  # the rise happens on the following day
    return round(gap, 2)
def getAverage(num, num2):
    """Return the mean of two values (each converted with ``float``), rounded to 3 decimals."""
    total = float(num) + float(num2)
    return round(total / 2, 3)
def getICycleAverage(num, num2):
    """Average two cycle-age readings unless they are more than 20 apart.

    When the readings differ by more than 20, ``num2`` is returned unchanged
    instead of a mean; otherwise the mean is returned rounded to 3 decimals.
    """
    difference = abs(num2 - num)
    if difference > 20:
        return num2
    mean = (float(num) + float(num2)) / 2
    return round(mean, 3)
def getAverageS(num, num2):
    """Return the mean of two numeric strings that may contain "," separators, rounded to 3 decimals."""
    first = float(num.replace(",", ""))
    second = float(num2.replace(",", ""))
    return round((first + second) / 2, 3)
#returns the date of the next full moon
# def findNextFullMoon(i):
# while i < tab2+1 :
# if(merged_df.loc[i,row_names[7]] == "Full Moon" or merged_df.loc[i,row_names[7]] == "Full moon"):
# return merged_df.loc[i+tab2,row_names[0]]
# i=i+1
# return "None"
def checkEvent(i):
    """Return the phase of row ``i`` if it is one of the four principal phases.

    Args:
        i: Index label of the row in ``merged_df`` to inspect.

    Returns:
        The row's "Phase" value when it is a full moon, last quarter, first
        quarter or new moon; otherwise the string ``"None"``.
    """
    phase = merged_df.loc[i, row_names[7]]
    # The two sites capitalise phase names differently ("Last quarter" vs
    # "Last Quarter"), so the comparison ignores case.
    principal_phases = {"full moon", "last quarter", "first quarter", "new moon"}
    if isinstance(phase, str) and phase.strip().lower() in principal_phases:
        return phase
    return "None"
# Build processed.csv: one row per date, combining the MoonGiant row (index i)
# with the phasesmoon row for the same date (index i + half). Dates, rise/set
# times and the phase are taken from phasesmoon; the numeric columns are
# combined by the averaging helpers.
fieldName = ['Date', 'Illumination', 'Moon rise', 'Moon set', 'Length',
             'Moon distance', 'Moon angle', 'Phase', 'Cycle age', 'Events']
half = int(tab2)  # integer offset, usable as an index label

# The file is opened once, and the header and every row go through one writer.
with open('processed.csv', 'w', newline='') as f:
    thewriter = csv.DictWriter(f, fieldnames=fieldName)
    thewriter.writeheader()
    for i in range(half):
        master = i + half  # phasesmoon row for the same date
        thewriter.writerow({
            'Date': merged_df.loc[master, 'Date'],
            'Illumination': getAverage(merged_df.loc[master, 'Illumination'],
                                       merged_df.loc[i, 'Illumination']),
            'Moon rise': merged_df.loc[master, 'Moon rise'],
            'Moon set': merged_df.loc[master, 'Moon set'],
            'Length': getLength(merged_df.loc[master, 'Moon set'],
                                merged_df.loc[master, 'Moon rise']),
            'Moon distance': getAverageS(merged_df.loc[master, 'Moon distance'],
                                         merged_df.loc[i, 'Moon distance']),
            'Moon angle': getAverage(merged_df.loc[master, 'Moon angle'],
                                     merged_df.loc[i, 'Moon angle']),
            'Phase': merged_df.loc[master, 'Phase'],
            'Cycle age': getICycleAverage(merged_df.loc[master, 'Cycle age'],
                                          merged_df.loc[i, 'Cycle age']),
            'Events': checkEvent(master),
        })
# Load the merged table back from disk, print its (rows, columns) and preview it.
df = pd.read_csv('processed.csv')
print(df.shape)
df.head()
(5289, 10)
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Cycle age | Events | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/01/2008 | 41.920 | 0:25 | 12:02 | 12.23 | 402539.590 | 0.495 | Waning crescent | 22.910 | None |
| 1 | 02/01/2008 | 32.770 | 1:18 | 12:34 | 12.84 | 404427.320 | 0.490 | Waning crescent | 23.805 | None |
| 2 | 03/01/2008 | 24.410 | 2:10 | 13:10 | 13.00 | 405357.365 | 0.490 | Waning crescent | 24.690 | None |
| 3 | 04/01/2008 | 16.960 | 3:05 | 13:49 | 13.56 | 405309.105 | 0.490 | Waning crescent | 25.580 | None |
| 4 | 05/01/2008 | 10.045 | 4:00 | 14:33 | 13.67 | 404294.245 | 0.490 | Waning crescent | 26.470 | None |
# Summary statistics of the numeric columns.
df.describe()
| Illumination | Length | Moon distance | Moon angle | Cycle age | |
|---|---|---|---|---|---|
| count | 5289.000000 | 5289.000000 | 5289.000000 | 5289.000000 | 5289.000000 |
| mean | 50.030936 | 12.027173 | 384974.931738 | 0.518072 | 14.453082 |
| std | 35.094871 | 1.115005 | 14902.928513 | 0.020899 | 8.497870 |
| min | 0.000000 | 9.320000 | 363354.600000 | 0.490000 | 0.000000 |
| 25% | 15.065000 | 11.110000 | 370062.510000 | 0.500000 | 7.095000 |
| 50% | 49.940000 | 12.020000 | 385508.230000 | 0.515000 | 14.445000 |
| 75% | 84.880000 | 12.930000 | 399876.365000 | 0.540000 | 21.830000 |
| max | 100.000000 | 14.670000 | 405458.665000 | 0.550000 | 29.235000 |
Considering that our research question is based on the moon's known behavior, we immediately thought about comparing the data we had to the "Cycle age" column, as it represents an evolving and devolving movement.
Here we compared the moon's age in the cycle to the illumination level and saw how they affected one another.
# Illumination against cycle age, with two buttons that switch the trace
# between a scatter plot and a bar chart.
# (`px` is this file's alias for plotly.graph_objects.)
illumination_trace = px.Scatter(
    x=df['Cycle age'],
    y=df['Illumination'],
    mode='markers',
)
plot = px.Figure(data=[illumination_trace])

# Each button restyles the existing trace's "type" attribute.
trace_type_buttons = [
    dict(args=["type", "scatter"], label="Scatter Plot", method="restyle"),
    dict(args=["type", "bar"], label="Bar Chart", method="restyle"),
]
plot.update_layout(
    updatemenus=[
        dict(type="buttons", direction="left", buttons=trace_type_buttons),
    ],
    autotypenumbers='convert types',
)
plot.show()
After realizing there's a solid connection between the two columns, we wanted to add another "layer" to the graph and see whether the moon angle also plays a role.
Looking at the results, you can clearly see that the "Moon angle" is too scattered and is not affected by the age of the cycle. Therefore the solid connection is seen only between "Cycle age" and "Illumination".
# Moon angle against cycle age, with two buttons that switch the trace
# between a scatter plot and a bar chart.
# (`px` is this file's alias for plotly.graph_objects.)
angle_trace = px.Scatter(
    x=df['Cycle age'],
    y=df['Moon angle'],
    mode='markers',
)
plot = px.Figure(data=[angle_trace])

# Each button restyles the existing trace's "type" attribute.
trace_type_buttons = [
    dict(args=["type", "scatter"], label="Scatter Plot", method="restyle"),
    dict(args=["type", "bar"], label="Bar Chart", method="restyle"),
]
plot.update_layout(
    updatemenus=[
        dict(type="buttons", direction="left", buttons=trace_type_buttons),
    ],
    autotypenumbers='convert types',
)
plot.show()
# 3-D line through (cycle age, moon angle, illumination), coloured by cycle age.
fig = pxe.line_3d(
    df,
    x="Cycle age",
    y="Moon angle",
    z="Illumination",
    color="Cycle age",
)
fig.update_layout(autotypenumbers='convert types')
fig.show()
After seeing the last graphs we decided to compare "Moon angle" with "Moon distance" and see if they have some sort of relation to one another.
After viewing the graph we could clearly see a constant descending "line" that indicates a relation between the data of the two columns.
# Scatter of moon angle against moon distance.
fig = pxe.scatter(
    df,
    x="Moon distance",
    y="Moon angle",
)
fig.update_layout(autotypenumbers='convert types')
fig.show()
# Moon distance in row order as a line, with a range selector button and a
# range slider on the x axis.
# (`px` is this file's alias for plotly.graph_objects.)
distance_trace = px.Scatter(y=df['Moon distance'], mode='lines')
plot = px.Figure(data=[distance_trace])

one_day_back = dict(count=1, step="day", stepmode="backward")
plot.update_layout(
    xaxis=dict(
        rangeselector=dict(buttons=[one_day_back]),
        rangeslider=dict(visible=True),
    ),
    autotypenumbers='convert types',
)
plot.show()
# Make sure the columns used for modelling hold floats. One vectorised cast
# replaces the cell-by-cell loop and leaves the columns with a float dtype.
numeric_columns = ['Illumination', 'Moon angle', 'Moon distance', 'Cycle age']
df[numeric_columns] = df[numeric_columns].astype(float)
# Preview the table after the float conversion.
df.head()
| Date | Illumination | Moon rise | Moon set | Length | Moon distance | Moon angle | Phase | Cycle age | Events | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 01/01/2008 | 41.920 | 0:25 | 12:02 | 12.23 | 402539.590 | 0.495 | Waning crescent | 22.910 | None |
| 1 | 02/01/2008 | 32.770 | 1:18 | 12:34 | 12.84 | 404427.320 | 0.490 | Waning crescent | 23.805 | None |
| 2 | 03/01/2008 | 24.410 | 2:10 | 13:10 | 13.00 | 405357.365 | 0.490 | Waning crescent | 24.690 | None |
| 3 | 04/01/2008 | 16.960 | 3:05 | 13:49 | 13.56 | 405309.105 | 0.490 | Waning crescent | 25.580 | None |
| 4 | 05/01/2008 | 10.045 | 4:00 | 14:33 | 13.67 | 404294.245 | 0.490 | Waning crescent | 26.470 | None |
# Linear regression of moon distance on moon angle, scored on a 20% hold-out.
# The value printed as "accuracy" is what LinearRegression.score returns.
X = df[['Moon angle']].to_numpy()
y = df[['Moon distance']].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression().fit(X_train, y_train)
acc = model.score(X_test, y_test)
distance_at_047 = model.predict([[0.47]])[0][0]

print(f"the accuracy score of predicting 'Moon Distance' using 'Moon angle' is -> {acc}")
print(f"Moon distance when moon angle is at 0.47 -> {distance_at_047}")
the accuracy score of predicting 'Moon Distance' using 'Moon angle' is -> 0.9917999511059503 Moon distance when moon angle is at 0.47 -> 419100.0574299945
# NOTE(review): `color` is never used below (the plot's `color='red'` is a
# keyword argument), so this looks like an accidental import -- confirm and drop.
from turtle import color
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Degree-4 polynomial fit of illumination against cycle age, using only the
# first 50 rows of the table.
first_rows = df.head(50)
x = np.array(first_rows['Cycle age'])
y = np.array(first_rows['Illumination'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

coefficients = np.polyfit(x_train, y_train, 4)
mymodel = np.poly1d(coefficients)
myline = np.linspace(1, 22, 100)  # x positions at which the fitted curve is drawn

# Training points with the fitted curve drawn over them.
plt.figure(facecolor='#ffffff')
ax = plt.axes()
ax.set_facecolor("#afafaf")
plt.scatter(x_train, y_train)
plt.plot(myline, mymodel(myline), color='red')
plt.show()

test_score = r2_score(y_test, mymodel(x_test))
illumination_at_6_5 = mymodel(6.5)
print(f"the accuracy score of predicting 'Illumination' using 'Cycle age' is -> {test_score}")
print(f"Illumination when cycle age is at 6.5 -> {illumination_at_6_5}")
the accuracy score of predicting 'Illumination' using 'Cycle age' is -> 0.9977996724258708 Illumination when cycle age is at 6.5 -> 41.65252306672311
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial regression of illumination on cycle age, scored with R^2
# on a 20% hold-out.
Xa = np.array(df['Cycle age']).reshape(-1, 1)
Ya = np.array(df['Illumination'])
Xa_train, Xa_test, Ya_train, Ya_test = train_test_split(Xa, Ya, test_size=0.2)

# Fit the feature expansion once, on the training data, and only *transform*
# the test and prediction inputs so the transformer is never re-fitted on them.
polli = PolynomialFeatures(degree=3)
xa_poly = polli.fit_transform(Xa_train)

modall = LinearRegression()
modall.fit(xa_poly, Ya_train)
y_pred = modall.predict(polli.transform(Xa_test))

print(f"our score is {r2_score(Ya_test,y_pred)}, but we can see that when we're reaching the edge of the cycle we're starting to get some weird values such as the next situation:")
print(f"when setting the cycle age to 28 the result we're getting is -> {modall.predict(polli.transform([[28]]))[0]}, this is situation is not good because the prediction is invalid")
our score is 0.9226166818708854, but we can see that when we're reaching the edge of the cycle we're starting to get some weird values such as the next situation: when setting the cycle age to 28 the result we're getting is -> -7.382200053681487, this is situation is not good because the prediction is invalid
# Degree-4 polynomial regression of illumination on cycle age, scored with R^2
# on a 20% hold-out.
Xa = np.array(df['Cycle age']).reshape(-1, 1)
Ya = np.array(df['Illumination'])
Xa_train, Xa_test, Ya_train, Ya_test = train_test_split(Xa, Ya, test_size=0.2)

# Fit the feature expansion once, on the training data, and only *transform*
# the test and prediction inputs so the transformer is never re-fitted on them.
polli = PolynomialFeatures(degree=4)
xa_poly = polli.fit_transform(Xa_train)

modall = LinearRegression()
modall.fit(xa_poly, Ya_train)
y_pred = modall.predict(polli.transform(Xa_test))

print(f"After raising our degree to 4 we can see that our score is now {r2_score(Ya_test,y_pred)}, at this point we can look what's happening in the same situation as before:")
print(f"when setting the cycle age to 28 the result we're getting is -> {modall.predict(polli.transform([[28]]))[0]}, this is situation is now valid and the prediction is mathes the csv data we have")
After raising our degree to 4 we can see that our score is now 0.9993513969105874, at this point we can look what's happening in the same situation as before: when setting the cycle age to 28 the result we're getting is -> 2.360911812053791, this is situation is now valid and the prediction is mathes the csv data we have
from sklearn.ensemble import RandomForestRegressor

# Random-forest regression of illumination on cycle age, moon distance and
# moon angle, using only the first 100 rows of the table.
sample_rows = df.head(100)
X = sample_rows[['Cycle age', 'Moon distance', 'Moon angle']]
y = sample_rows['Illumination']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train Regression models
rF = RandomForestRegressor(random_state=1)
rF.fit(X_train, y_train)
rF_pred = rF.predict(X_test)

# Hold-out targets as a flat array, in the same order as the predictions.
some = np.array(y_test)
forest_score = r2_score(y_test, rF_pred)
print(f"the accuracy score of predicting 'Illumination' using 'Cycle age' 'Moon distance' and 'Moon angle' is -> {forest_score}")

# Actual values (dashed) with the predictions (solid) drawn over them.
plt.figure(facecolor='#ffffff')
ax = plt.axes()
ax.set_facecolor("#afafaf")
plt.xlabel("Sample amount")
plt.ylabel('Illumination precent')
plt.plot(some, linestyle='dashed', color='#000', linewidth=6.0)
plt.plot(rF_pred, linestyle='solid', color='#adefca', linewidth=3.0)
plt.show()
the accuracy score of predicting 'Illumination' using 'Cycle age' 'Moon distance' and 'Moon angle' is -> 0.9986941630965762
# based on : https://www.subsystems.us/uploads/9/8/9/4/98948044/moonphase.pdf
def julian_day(year, month, day):
    """Return the Julian Day number at 0h of a Gregorian calendar date.

    Args:
        year: Calendar year.
        month: Calendar month, 1-12.
        day: Day of the month.

    Returns:
        The Julian Day as a float ending in ``.5``.
    """
    # The formula counts January and February as months 13 and 14 of the
    # previous year; without this step those two months come out 1-2 days off.
    if month <= 2:
        year -= 1
        month += 12
    A = int(year / 100)
    B = int(A / 4)
    C = 2 - A + B
    E = int(365.25 * (year + 4716))
    F = int(30.6001 * (month + 1))
    return C + day + E + F - 1524.5


def moon_cycle_age(JD):
    """Return the age of the moon cycle, in days, at Julian Day ``JD``.

    The age is the number of days elapsed since Julian Day 2451549.5
    (2000-01-06, the reference the calculation counts new moons from),
    reduced modulo a cycle length of 29.53 days.

    Args:
        JD: Julian Day number.

    Returns:
        A value in ``[0, 29.53)``.
    """
    daySinceNew = JD - 2451549.5
    # The modulo keeps the result non-negative for dates before the reference.
    return daySinceNew % 29.53


# True both when run as a script and inside a notebook kernel; the guard only
# keeps the prompt from firing if this code is imported as a module.
if __name__ == "__main__":
    userInput = input('Enter date (yyyy/mm/dd) : ')
    date_format = '%Y/%m/%d'  # renamed so the builtin format() stays usable
    dt_Input = datetime.strptime(userInput, date_format)
    print(dt_Input.date())
    JD = julian_day(dt_Input.year, dt_Input.month, dt_Input.day)
    moonCycle = moon_cycle_age(JD)
    print("Moon cycle age = ",moonCycle)
2024-07-07 Moon cycle age = 1.4099999999994601
import plotly
# Switch on Plotly's offline notebook mode.
plotly.offline.init_notebook_mode()